{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 字符串操作\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "from pandas import DataFrame\n",
    "from pandas import Series\n",
    "import re"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 字符串对象方法"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a', 'b', '  guido']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val = 'a,b,  guido'\n",
    "val.split(',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['a', 'b', 'guido']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pieces = [x.strip() for x in val.split(',')] # strip过滤字符串前后的不可见字符\n",
    "pieces"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'a::b::guido'"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "first, second, third = pieces\n",
    "first + '::' + second + '::' + third"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "'a::b::guido'"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'::'.join(pieces)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "True"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "'guido' in val # 判断sub string是否存在"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "1"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val.index(',') # 第一次出现的索引位置，与find不同，找不到就抛出异常。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "-1"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "val.find(':') # 找不到返回-1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "2"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    " val.count(',') # 计算某个字符出现次数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a::b::  guido\n",
      "ab  guido\n"
     ]
    }
   ],
   "source": [
    "print(val.replace(',', '::')) # 替换\n",
    "print(val.replace(',', ''))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# Python内置的字符串方法（自己google去查详细，懒得打了...）\n",
    "# count\n",
    "# startswith/endswith\n",
    "# join\n",
    "# index\n",
    "# find\n",
    "# rfind\n",
    "# replace\n",
    "# strip/lstrip/rstrip\n",
    "# split\n",
    "# lower/upper\n",
    "# ljust/rjust"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 正则表达式"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['foo', 'bar', 'baz', 'qux']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = \"foo   bar\\t baz  \\tqux\"\n",
    "re.split('\\s+', text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['foo', 'bar', 'baz', 'qux']"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regex = re.compile('\\s+')\n",
    "regex.split(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['   ', '\\t ', '  \\t']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regex.findall(text) # 找到所有匹配'\\s+'的内容"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['dave@google.com', 'steve@gmail.com', 'rob@gmail.com', 'ryan@yahoo.com']"
      ]
     },
     "execution_count": 17,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text = \"\"\"Dave dave@google.com\n",
    "Steve steve@gmail.com\n",
    "Rob rob@gmail.com\n",
    "Ryan ryan@yahoo.com\n",
    "\"\"\"\n",
    "pattern = r'[A-Z0-9._%+-]+@[A-Z0-9.-]+\\.[A-Z]{2,4}'\n",
    "regex = re.compile(pattern, flags=re.IGNORECASE) # 忽略大小写\n",
    "regex.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 18,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "<_sre.SRE_Match object; span=(5, 20), match='dave@google.com'>\n"
     ]
    },
    {
     "data": {
      "text/plain": [
       "'dave@google.com'"
      ]
     },
     "execution_count": 18,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "m = regex.search(text)\n",
    "print(m)\n",
    "text[m.start():m.end()]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 19,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "None\n"
     ]
    }
   ],
   "source": [
    "print(regex.match(text)) # 返回None，因为它只匹配出现在字符串开头的模式。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 20,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dave REDACTED\n",
      "Steve REDACTED\n",
      "Rob REDACTED\n",
      "Ryan REDACTED\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(regex.sub('REDACTED', text)) # 匹配到的模式替换为指定字符串"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('wesm', 'bright', 'net')"
      ]
     },
     "execution_count": 21,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = r'([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\.([A-Z]{2,4})' # 用()包含group\n",
    "regex = re.compile(pattern, flags=re.IGNORECASE)\n",
    "m = regex.match('wesm@bright.net')\n",
    "m.groups()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "[('dave', 'google', 'com'),\n",
       " ('steve', 'gmail', 'com'),\n",
       " ('rob', 'gmail', 'com'),\n",
       " ('ryan', 'yahoo', 'com')]"
      ]
     },
     "execution_count": 22,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regex.findall(text)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Dave Username: dave, Domain: google, Suffix: com\n",
      "Steve Username: steve, Domain: gmail, Suffix: com\n",
      "Rob Username: rob, Domain: gmail, Suffix: com\n",
      "Ryan Username: ryan, Domain: yahoo, Suffix: com\n",
      "\n"
     ]
    }
   ],
   "source": [
    "print(regex.sub(r'Username: \\1, Domain: \\2, Suffix: \\3', text))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'domain': 'bright', 'suffix': 'net', 'username': 'wesm'}"
      ]
     },
     "execution_count": 24,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "regex = re.compile(r\"\"\"\n",
    "(?P<username>[A-Z0-9._%+-]+)\n",
    "@\n",
    "(?P<domain>[A-Z0-9.-]+)\n",
    "\\.\n",
    "(?P<suffix>[A-Z]{2,4})\"\"\", flags=re.IGNORECASE|re.VERBOSE)\n",
    "m = regex.match('wesm@bright.net')\n",
    "m.groupdict()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 正则表达式方法（不是重点，自己google...）\n",
    "# fundall/finditer\n",
    "# match\n",
    "# search\n",
    "# split\n",
    "# sub/subn\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# pandas中矢量化的字符串函数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dave     False\n",
       "Rob       True\n",
       "Steve     True\n",
       "Wes        NaN\n",
       "dtype: object"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data = {'Dave': 'dave@google.com',\n",
    "        'Steve': 'steve@gmail.com',\n",
    "        'Rob': 'rob@gmail.com',\n",
    "        'Wes': np.nan}\n",
    "data = Series(data)\n",
    "data.str.contains('gmail')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dave     [(dave, google, com)]\n",
       "Rob        [(rob, gmail, com)]\n",
       "Steve    [(steve, gmail, com)]\n",
       "Wes                        NaN\n",
       "dtype: object"
      ]
     },
     "execution_count": 28,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "pattern = '([A-Z0-9._%+-]+)@([A-Z0-9.-]+)\\\\.([A-Z]{2,4})'\n",
    "data.str.findall(pattern, flags=re.IGNORECASE)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 29,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dave     True\n",
       "Rob      True\n",
       "Steve    True\n",
       "Wes       NaN\n",
       "dtype: object"
      ]
     },
     "execution_count": 29,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "matches = data.str.match(pattern, flags=re.IGNORECASE)\n",
    "matches # 原教材与现状不一致，matches反映每一个key是否匹配。"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 30,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Dave     dave@\n",
       "Rob      rob@g\n",
       "Steve    steve\n",
       "Wes        NaN\n",
       "dtype: object"
      ]
     },
     "execution_count": 30,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "data.str[:5] # 字符串统一切片"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 31,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "# 矢量化字符串方法（参考普通字符串...）\n",
    "# cat\n",
    "# contains\n",
    "# count\n",
    "# endswith/startswith\n",
    "# findall\n",
    "# get\n",
    "# join\n",
    "# len\n",
    "# lower/upper\n",
    "# match\n",
    "# pad\n",
    "# center\n",
    "# repeat\n",
    "# replace\n",
    "# slice\n",
    "# split\n",
    "# strip/lstrip/rstrip"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
